Dimitrios Karamanis
University of Piraeus
dkaramanis@hotmail.com
%matplotlib inline
from ipystata.config import config_stata
from IPython.display import Image
from IPython.core.display import HTML
# Tell ipystata which Stata executable to use.
# Raw string: the Windows path contains backslashes (\P, \S) that a normal
# string literal would treat as (invalid) escape sequences.
config_stata(r'C:\Program Files\Stata14\StataSE-64.exe')
ITEMS:
Let's see some examples:
During WWII, the Navy tried to determine where they needed to armor their aircraft to ensure they came back home. They ran an analysis of where planes had been shot up, and came up with this. Obviously the places that needed to be up-armored are the wingtips, the central body, and the elevators. That’s where the planes were all getting shot up. Abraham Wald, a statistician, disagreed. He thought they should better armor the nose area, engines, and mid-body. Which was crazy, of course. That’s not where the planes were getting shot. Except Mr. Wald realized what the others didn’t. The planes were getting shot there too, but they weren’t making it home. What the Navy thought it had done was analyze where aircraft were suffering the most damage. What they had actually done was analyze where aircraft could suffer the most damage without catastrophic failure. All of the places that weren’t hit? Those planes had been shot there and crashed. They weren’t looking at the whole sample set, only the survivors.
# Load the local file airplane.PNG as an IPython Image so the notebook can
# render it (presumably the aircraft damage diagram from the WWII story --
# confirm against the file).
Image(filename='airplane.PNG')
# Load the local file cholera.PNG as an IPython Image.
Image(filename='cholera.PNG')
There are several types of graphs; here we present some of them.
# Load the local file typesgraphs.PNG as an IPython Image (presumably an
# overview of graph types -- confirm against the file).
Image(filename='typesgraphs.PNG')
The “classic” types of misleading graphs include cases where:
# Five example images stored in the MisleadingGraphs/ folder, numbered 1-5.
# NOTE(review): a notebook only auto-renders the last expression of a cell,
# so these need to sit in separate cells (or be passed to display()) to all
# show up -- confirm the cell layout.
Image(filename='MisleadingGraphs/1.JPG')
Image(filename='MisleadingGraphs/2.JPG')
Image(filename='MisleadingGraphs/3.JPG')
Image(filename='MisleadingGraphs/4.JPG')
Image(filename='MisleadingGraphs/5.JPG')
%%stata
* Load the uslifeexp example dataset shipped with Stata, replacing any data in memory.
sysuse uslifeexp.dta, clear
* List the variables with their storage types and labels.
describe
* Print summary statistics for every variable.
summarize
%%stata
* Load the lifeexp example dataset shipped with Stata, replacing any data in memory.
sysuse lifeexp.dta, clear
* List the variables, then print summary statistics for each of them.
describe
summarize
* See how many different countries we have.
* (Stata line comments start with an asterisk; a leading hash sign is not a comment marker.)
codebook country
* We have 68 observations and 6 variables.
Below are three equivalent ways to write the same line-graph command:
%%stata
* Form 1: the fully spelled-out command, graph twoway line, plotting le against year.
sysuse uslifeexp.dta, clear
graph twoway line le year
%%stata
* Form 2: the same plot with the leading graph keyword dropped.
sysuse uslifeexp.dta, clear
twoway line le year
%%stata
* Form 3: the same plot using line on its own.
sysuse uslifeexp.dta, clear
line le year
%%stata
* Same line plot of le against year, drawn with the s1mono scheme.
sysuse uslifeexp.dta, clear
twoway line le year, scheme(s1mono)
%%stata
* Same line plot again, drawn with the economist scheme.
sysuse uslifeexp.dta, clear
twoway line le year, scheme(economist)
%%stata
* Plot four series (le_wmale, le_wfemale, le_bmale, le_bfemale) against year in one graph.
sysuse uslifeexp.dta, clear
twoway line le_wmale le_wfemale le_bmale le_bfemale year
%%stata
* Same four-series plot, plus a text annotation anchored at y = 32, x = 1920.
* The bf and it tags set bold and italic text; place(3) puts the text at the 3 o-clock side of the anchor.
sysuse uslifeexp.dta, clear
twoway line le_wmale le_wfemale le_bmale le_bfemale year ///
, text(32 1920 "{bf:1918} {it:Influenza} Pandemic", place(3))
%%stata
* Scatterplot of lexp (y axis) against safewater (x axis).
sysuse lifeexp.dta, clear
scatter lexp safewater
%%stata
* Scatterplot matrix: every pairwise scatter among the four listed variables.
sysuse lifeexp.dta, clear
graph matrix lexp safewater gnppc popgrowth
%%stata
* Two overlaid scatters for region 2, both skipping observations with missing gnppc:
*   layer 1 draws hollow circles weighted by gnppc, so marker area follows gnppc;
*   layer 2 draws no marker and only prints the country name centered on each point.
* The legend is turned off and the note explains the marker size.
sysuse lifeexp.dta, clear
twoway scatter lexp safewater [w= gnppc] if region==2 /* North America */ ///
& gnppc ~=., msymbol(circle_hollow) || scatter lexp safewater if region==2 & gnppc ~=., ///
msymbol(none) mlabel(country) mlabposition(0) legend(off) ///
note("Note: Area of symbol proportional to country's GNP per capita")
%%stata
* Dot chart of gnppc, one row per country, restricted to region 2.
sysuse lifeexp.dta, clear
graph dot gnppc if region==2 , over(country) ytitle(GNP per capita in $)
%%stata
* Horizontal bar chart of lexp per country in region 2.
* sort(1) descending orders the bars by the first plotted variable, largest first;
* the country labels are set to a small font.
sysuse lifeexp.dta, clear
graph hbar lexp if region==2 , over(country , sort(1) ///
descending label(labsize(small))) ///
title("Life expectancy at birth" ,size(medium)) ///
ytitle("Age")
%%stata
* Box plot of gnppc, one box per region.
sysuse lifeexp.dta, clear
graph box gnppc , over( region )
%%stata
* Switch to the folder holding the local data file; this path is machine-specific.
cd C:\Users\Dimitris\Documents
* Load data.dta from that folder, replacing any data in memory.
use data.dta, clear
* Summary statistics of subjective, reported separately for each country.
bysort country: sum subjective
%%stata
* No dataset is loaded in this cell: it works on whatever is already in memory
* (presumably data.dta from the previous cell -- confirm before running it alone).
sort country
* For each answer value N from 1 to 5, subjectiveNN holds the number of observations
* in that country with subjective equal to N; it is filled only on those observations.
by country: egen subjective11=count(subjective) if subjective==1
by country: egen subjective22=count(subjective) if subjective==2
by country: egen subjective33=count(subjective) if subjective==3
by country: egen subjective44=count(subjective) if subjective==4
by country: egen subjective55=count(subjective) if subjective==5
* Stacked percentage bars, one bar per country, one colored segment per answer value.
graph bar subjective11 subjective22 subjective33 subjective44 subjective55, percentage over(country, label(angle(90))) ///
stack title("Subjective general health in EU countries") ytitle("%") ///
legend( label(1 "Very bad") label(2 "Bad") label(3 "Fair") label(4 "Good") label(5 "Very good") cols(5) symxsize(10) ) ///
bar(1,color(purple)) bar(2,color(red)) bar(3,color(orange)) bar(4,color(ebblue)) bar(5,color(green))
%%stata
* Load the local data file by absolute path; this path is machine-specific.
use C:\Users\Dimitris\Documents\data.dta, clear
* Reduce the data to one row per country holding the means of the three variables.
collapse (mean) subjective imm_stock imm_inflows , by(country)
* Create a numeric id for each country; the country names become its value labels,
* which the valuelabel suboption of xlabel prints on the x axis below.
encode country,gen(id)
**SCATTER & BAR
twoway (bar imm_stock id ) (scatter imm_inflows id, mfcolor(red) ytitle(% of population) ///
title(Stock & Inflows of foreigners per country, size(medium)) ///
xtitle("") xlabel(1(1)22, labsize(small) angle(vertical) valuelabel) ///
legend(ring(0) pos(2) order(1 "Stock" 2 "Inflows ") cols(1)) note(Source: OECD International Migration Database) )
%%stata
/*first install radar by typing: ssc install radar*/
* radar is a user-written command, not part of official Stata.
* Draws turn, mpg and trunk for each make, restricted to observations where foreign is nonzero.
* NOTE(review): lc, lw and r look like line colors, line widths and axis values -- confirm in the radar help file.
sysuse auto.dta, clear
radar make turn mpg trunk if foreign, title(Radar graph) ///
lc(red blue green) lw(*1 *2 *4) r(0 12 14 18 50)
%%stata
* Load data2.dta from the current working directory.
use data2.dta, clear
* Label helper variables: each copies country_code only for the observations to be labelled.
* country_code1 and country_code_highFD are created but not used by the graph in this cell.
gen country_code1=country_code if Open_gate_wall == "Gate"
gen country_code3=country_code if Open_gate_wall == "Wall"
gen country_code_GRC=country_code if country_code=="GRC"
gen country_code_high=country_code if RD>3
gen country_code_highFD=country_code if FD>0.9
* Split each variable into one new variable per Open_gate_wall category (RD1, RD2, RD3 and so on).
* Only RD1-RD3 are plotted below; the splits of lngdp_pc, FD and ka are not used in this cell.
separate RD, by(Open_gate_wall)
separate lngdp_pc , by(Open_gate_wall)
separate FD, by(Open_gate_wall)
separate ka, by(Open_gate_wall)
* Six overlaid scatters of RD against lngdp_pc:
*   plots 1-3 draw no markers, only country-code labels (Wall countries, RD above 3, and GRC);
*   plots 4-6 draw hollow circles weighted by ka, one plot per category, outlined blue, green, red.
* The legend keeps plots 6, 4, 5 and labels them Wall, Gate, Open
* (assumes separate numbers the categories in the order Gate, Open, Wall -- confirm).
scatter RD lngdp_pc, msymbol(none) ms(i) mlabpos(c) mlabel( country_code3) mlabcolor(red) ///
|| scatter RD lngdp_pc, msymbol(none) ms(i) mlabpos(3) mlabel( country_code_high) mlabcolor(green) ///
|| scatter RD lngdp_pc, msymbol(none) ms(i) mlabpos(c) mlabel( country_code_GRC) mlabcolor(green) ///
|| scatter RD1 lngdp_pc [w=ka], ms(oh) mcol(gs12) msymbol(circle_hollow) mlcolor(blue) ytitle("R&D %GDP") xtitle("GDP per capita") legend(on) ///
title("R&D,GDP per Capita and Capital Controls") || scatter RD2 lngdp_pc [w=ka], ms(oh) mcol(gs12) msymbol(circle_hollow) mlcolor(green) legend(on) ///
|| scatter RD3 lngdp_pc [w=ka], ms(oh) mcol(gs12) msymbol(circle_hollow) mlcolor(red) note("Note: Area of symbol proportional to country's Capital Control level") ///
legend( ring(0) pos(2) col(1) order(6 4 5) label(4 "Gate") label(5 "Open") label(6 "Wall"))
%%stata
* Load data3.dta from the current working directory.
use data3.dta, clear
* Yearly means of RD and ka, stored on every observation of that year.
sort year
by year: egen mean_RD=mean(RD)
by year: egen mean_ka=mean(ka)
label variable mean_RD "R&D %GDP"
label variable mean_ka "Capital Controls"
* Connected-line plot with two y axes: mean_ka on axis 1 and mean_RD on axis 2.
* The legend sits inside the plot region at the 5 o-clock position.
* xtitle is given twice, first as year and later as Year.
twoway connected mean_ka year , msymbol(diamond) title("R&D as %GDP and Capital Controls through years", size(medium)) xtitle("year") yscale(alt axis(1)) ///
|| connected mean_RD year, yaxis(2) yscale(alt axis(2)) legend( ring(0) pos(5) col(1) label(2 "R&D %GDP") label(1 "Capital Controls") size(small)) ///
xtitle("Year") xsc(r(1996 2013)) xlabel(1996(2)2013 ,labsize(small)) note("NOTE: Capital controls range from 0 (open) to 1 (close)" ,justification(left) box)
%%stata
* Same yearly means as computed from data3.dta above, this time drawn on one shared y axis.
use data3.dta, clear
sort year
by year: egen mean_RD=mean(RD)
by year: egen mean_ka=mean(ka)
label variable mean_RD "R&D %GDP"
label variable mean_ka "Capital Controls"
* Plain line plot of both means against year; x axis labelled every 2 years from 1996.
line mean_ka mean_RD year , title("R&D as %GDP and Capital Controls through years", size(medium)) ///
xtitle("Year") legend(label(2 "R&D %GDP") label(1 "Capital Controls") ring(0) pos(5) col(1)) ///
xsc(r(1996 2013)) xlabel(1996(2)2013 ,labsize(small))
%%stata
* Scatter of le_male and le_female against year from 1950 on,
* overlaid with one linear fit line per series.
sysuse uslifeexp.dta, clear
scatter le_male le_female year if year >= 1950 ///
|| lfit le_male year if year >= 1950 ///
|| lfit le_female year if year >= 1950
%%stata
* Same graph with a title and two text annotations (Female, Male) placed next to the series at x = 1978.
sysuse uslifeexp.dta, clear
scatter le_male le_female year if year >= 1950 ///
|| lfit le_male year if year >= 1950 ///
|| lfit le_female year if year >= 1950 ///
,title("US Male and Female Life Expectancy, 1950-2000") ///
text(75 1978 "Female", place(3)) ///
text(68 1978 "Male", place(3))
%%stata
* Linear fit with confidence band (lfitci) plus the raw scatter, both restricted to region 2.
* The legend is placed inside the plot region and shows only keys 2 and 1, relabelled.
sysuse lifeexp.dta, clear
twoway ///
(lfitci lexp safewater if region == 2) /* North America */ ///
(scatter lexp safewater if region == 2) ///
,title("Life expectancy at birth by access to safe water, 1998") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(ring(0) pos(5) order(2 "Linear fit" 1 "95% CI"))
%%stata
* Same graph with each point labelled by country and a subtitle added.
sysuse lifeexp.dta, clear
twoway ///
(lfitci lexp safewater if region == 2) /* North America */ ///
(scatter lexp safewater if region == 2, mlabel(country)) ///
,title("Life expectancy at birth by access to safe water, 1998") ///
subtitle("North America") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(ring(0) pos(5) order(2 "Linear fit" 1 "95% CI"))
%%stata
* pos holds a clock position for the marker label of selected countries;
* it stays missing for every country not listed here.
sysuse lifeexp.dta, clear
generate pos = 12 if country == "Panama"
replace pos = 12 if country == "Honduras"
replace pos = 10 if country == "Cuba"
replace pos = 9 if country == "Jamaica"
replace pos = 9 if country == "El Salvador"
replace pos = 9 if country == "Trinidad and Tobago"
replace pos = 9 if country == "Dominican Republic"
* Same fit-plus-scatter graph for region 2; mlabvposition(pos) reads the label position
* from pos per observation, and the plot region gets extra margin on the right.
twoway ///
(lfitci lexp safewater if region == 2) /* North America */ ///
(scatter lexp safewater if region == 2 ///
, mlabel(country) mlabvposition(pos)) ///
,title("Life expectancy at birth by access to safe water, 1998") ///
subtitle("North America") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(ring(0) pos(5) order(2 "Linear fit" 1 "95% CI")) ///
plotregion(margin(r+10))
%%stata
* One scatter covering regions 2 and 3 together, so both share a single marker style.
* Points are labelled by country and the plot region gets extra margin on the right.
sysuse lifeexp.dta, clear
twoway (scatter lexp safewater if region == 2 | region == 3 ///
,mlabel(country)) ///
,title("Life expectancy at birth by access to safe water, 1998") ///
subtitle("North and South America") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
plotregion(margin(r+10))
%%stata
* pos is the clock position of each marker label: 3 by default,
* overridden to 9 or 12 for the countries listed below.
sysuse lifeexp.dta, clear
generate pos = 3
replace pos = 9 if country == "Argentina"
replace pos = 9 if country == "Canada"
replace pos = 9 if country == "Cuba"
replace pos = 9 if country == "Panama"
replace pos = 9 if country == "Venezuela"
replace pos = 9 if country == "Jamaica"
replace pos = 9 if country == "Dominican Republic"
replace pos = 9 if country == "Ecuador"
replace pos = 9 if country == "El Salvador"
replace pos = 12 if country == "Puerto Rico"
* One scatter layer per region (2, then 3), so each region gets its own legend key.
twoway ///
(scatter lexp safewater if region == 2 ///
,mlabel(country) mlabvposition(pos)) ///
(scatter lexp safewater if region == 3 ///
,mlabel(country) mlabvposition(pos)) ///
,title("Life expectancy at birth by access to safe water, 1998") ///
subtitle("North and South America") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(ring(0) pos(5) order(1 "North America" 2 "South America") cols(1))
%%stata
* pos is the clock position of each marker label: 3 by default,
* overridden to 9 or 12 for the countries listed below.
sysuse lifeexp.dta, clear
generate pos = 3
replace pos = 9 if country == "Argentina"
replace pos = 9 if country == "Canada"
replace pos = 9 if country == "Cuba"
replace pos = 9 if country == "Panama"
replace pos = 9 if country == "Venezuela"
replace pos = 9 if country == "Jamaica"
replace pos = 9 if country == "Dominican Republic"
replace pos = 9 if country == "Ecuador"
replace pos = 9 if country == "El Salvador"
replace pos = 12 if country == "Puerto Rico"
* Four layers: a scatter per region (small markers, hollow circles for region 3)
* and a linear fit per region (navy for region 2, maroon for region 3).
* The legend lists all four keys in one column.
twoway ///
(scatter lexp safewater if region == 2 ///
,mlabel(country) mlabvposition(pos) msize(small)) ///
(scatter lexp safewater if region == 3 ///
,mlabel(country) mlabvposition(pos) msize(small) msymbol(circle_hollow)) ///
(lfit lexp safewater if region == 2, clcolor(navy)) ///
(lfit lexp safewater if region == 3, clcolor(maroon)) ///
,title("Life expectancy at birth by access to safe water, 1998") ///
subtitle("North and South America") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(ring(0) pos(5) cols(1) order(1 "North America" 2 "South America" ///
3 "North America linear fit" 4 "South America linear fit"))
%%stata
* Black-and-white version of the two-region graph: regions differ by marker shape
* (solid vs hollow circle) and fit-line pattern (solid vs dashed) rather than by color.
* pos is the clock position of each marker label: 3 by default, overridden below.
sysuse lifeexp.dta, clear
generate pos = 3
replace pos = 9 if country == "Argentina"
replace pos = 9 if country == "Canada"
replace pos = 9 if country == "Cuba"
replace pos = 9 if country == "Panama"
replace pos = 9 if country == "Venezuela"
replace pos = 9 if country == "Jamaica"
replace pos = 9 if country == "Dominican Republic"
replace pos = 9 if country == "Ecuador"
replace pos = 9 if country == "El Salvador"
replace pos = 12 if country == "Puerto Rico"
* No delimiter switch is used here: every continued line already ends in a three-slash
* continuation, and the command is not terminated by a semicolon. Under a semicolon
* delimiter the twoway command below would never be terminated.
twoway ///
(scatter lexp safewater if region == 2 ///
,mlabel(country) mlabvposition(pos) msize(small) mcolor(black) mlabcolor(black)) ///
(scatter lexp safewater if region == 3 ///
,mlabel(country) mlabvposition(pos) msize(small) mcolor(black) mlabcolor(black) ///
msymbol(circle_hollow)) ///
(lfit lexp safewater if region == 2, clcolor(black)) ///
(lfit lexp safewater if region == 3, clcolor(black) clpattern(dash)) ///
,title("Life expectancy at birth by access to safe water, 1998", color(black)) ///
subtitle("North and South America") ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(ring(0) pos(5) cols(1) order(1 "North America" 2 "South America" ///
3 "North America linear fit" 4 "South America linear fit"))
%%stata
* One scatter panel per region plus a total panel holding all observations.
* All options follow a single comma, as in the documented command syntax.
sysuse lifeexp.dta, clear
twoway scatter lexp safewater ///
, by(region, total) ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water")
%%stata
* By-region panels in the compact style. The title is given inside by(), so it is set
* once for the combined graph, and note("") replaces the by() note with an empty one.
sysuse lifeexp.dta, clear
twoway scatter lexp safewater ///
,by(region,total style(compact) ///
title("Life expectancy by access to safe water") note("")) ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water")
%%stata
* Same panels with explicit axes: x range 20-100, x ticks every 10 from 20,
* x labels every 10 from 30 in a small font, y labels every 5 from 55 to 80 drawn horizontally.
sysuse lifeexp.dta, clear
twoway scatter lexp safewater ///
, by(region,total style(compact) ///
title("Life expectancy by access to safe water") note("")) ///
xscale(range(20 100)) ///
xtick(20(10)100) ///
xlabel(30(10)100, labsize(small)) ///
xtitle("Percent of population with access to safe water") ///
ytitle("Life expectancy at birth") ///
ylabel(55(5)80, angle(0))
%%stata
* pos is the clock position of each marker label: 3 by default, overridden below.
sysuse lifeexp.dta, clear
generate pos = 3
replace pos = 6 if country == "Honduras"
replace pos = 9 if country == "Canada"
replace pos = 9 if country == "Cuba"
replace pos = 9 if country == "Guatemala"
replace pos = 9 if country == "Panama"
replace pos = 9 if country == "Jamaica"
replace pos = 9 if country == "Dominican Republic"
replace pos = 9 if country == "Ecuador"
replace pos = 9 if country == "El Salvador"
replace pos = 12 if country == "Puerto Rico"
* Scatter plus linear fit for region 2, all in black, with the legend off.
* name(north_america, replace) keeps the graph in memory under that name,
* overwriting any existing graph of the same name, so graph combine can use it later.
twoway ///
(scatter lexp safewater if region == 2, ///
mcolor(black) msize(small) ///
mlabel(country) mlabvposition(pos) mlabcolor(black)) ///
(lfit lexp safewater if region == 2, clcolor(black)) ///
,name(north_america, replace) ///
subtitle("North America", color(black)) ///
ylabel(,angle(0)) ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(off)
%%stata
* No dataset is loaded in this cell: it reuses the data and the pos variable already
* in memory (presumably left by the north_america cell -- run that cell first).
replace pos = 9 if country == "Venezuela"
replace pos = 9 if country == "Argentina"
replace pos = 9 if country == "Ecuador"
* Scatter plus linear fit for region 3, all in black, with the legend off.
* name(south_america, replace) keeps the graph in memory under that name for graph combine.
* Every continued line uses the same three-slash continuation marker.
twoway ///
(scatter lexp safewater if region == 3, ///
mcolor(black) msize(small) ///
mlabel(country) mlabvposition(pos) mlabcolor(black)) ///
(lfit lexp safewater if region == 3, clcolor(black)) ///
,name(south_america, replace) ///
subtitle("South America", color(black)) ///
ylabel(, angle(0)) ///
ytitle("Life expectancy at birth") ///
xtitle("Percent of population with access to safe water") ///
legend(off)
%%stata
* Stack the two named graphs in a single column under one shared title.
* Both graphs must already exist in memory under these names.
graph combine north_america south_america ///
,title("Life expectancy by access to safe water", color(black)) col(1)
%%stata
* Same combination of the two named graphs, now with common x and y axis scales
* across both panels and an explicit overall size of 7 by 10.5.
graph combine north_america south_america ///
,title("Life expectancy by access to safe water", ///
color(black)) ///
xcommon ycommon ///
xsize(7) ysize(10.5) ///
col(1)
* THE GRAPH BELOW LOOKS ALMOST THE SAME AS THE GRAPH ABOVE, BUT IF YOU PLOT IT IN STATA, IT WILL LOOK MUCH MORE BEAUTIFUL
%%stata
* vgcolormap is not part of official Stata; it comes with the vgsg package.
* after installing "net install vgsg" type:
vgcolormap, quietly
* Change marker color in graphs by using the option mcolor()